library(tidyverse)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------------------------------------------------------
as.difftime(): lubridate, base
date(): lubridate, base
filter(): dplyr, stats
intersect(): lubridate, base
lag(): dplyr, stats
setdiff(): lubridate, base
union(): lubridate, base
library(stringr)
library(anytime)
library(lubridate)
library(skimr)
library(listviewer)
library(rlang)
Attaching package: ‘rlang’
The following objects are masked from ‘package:purrr’:
%||%, %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl, invoke, is_atomic,
is_bare_atomic, is_bare_character, is_bare_double, is_bare_integer, is_bare_list, is_bare_logical,
is_bare_numeric, is_bare_vector, is_character, is_double, is_empty, is_formula, is_function, is_integer,
is_list, is_logical, is_null, is_scalar_atomic, is_scalar_character, is_scalar_double, is_scalar_integer,
is_scalar_list, is_scalar_logical, is_scalar_vector, is_vector, list_along, prepend, rep_along, set_names,
splice
The following object is masked from ‘package:tibble’:
has_name
library(forcats)
library(ggjoy)
library(padr)
library(hrbrthemes)
hrbrthemes is under *active* development. See https://github.com/hrbrmstr/hrbrthemes for info/news.
library(janitor)
load("output/trennid_toodeldud.RData")
glimpse(trennid_toodeldud)
Observations: 75,808
Variables: 14
$ profile <dbl> 10005837, 10005837, 10005837, 10005837, 10005837, 10005837, 10005837, 10005837, 10005837, 10005837...
$ synnipaev <date> 1978-06-26, 1978-06-26, 1978-06-26, 1978-06-26, 1978-06-26, 1978-06-26, 1978-06-26, 1978-06-26, 1...
$ sugu <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male", "M...
$ pikkus_cm <dbl> 195, 195, 195, 195, 195, 195, 195, 195, 195, 195, 195, 195, 195, 180, 180, 169, 169, 169, 169, 169...
$ kaal_kg <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ lemmik_spordiala <chr> "Cycling (Sport)", "Cycling (Sport)", "Cycling (Sport)", "Cycling (Sport)", "Cycling (Sport)", "Cy...
$ workout <dbl> 519605661, 349744705, 345915031, 345404462, 344247597, 341731612, 340579183, 340106467, 339578378,...
$ workout_date <dttm> 2017-05-10 12:35:00, 2017-06-01 14:41:00, 2017-05-25 14:42:00, 2017-05-24 16:10:00, 2017-05-22 16...
$ spordiala <chr> "riding", "cycling", "cycling", "cycling", "cycling", "cycling", "doing weight training", "cycling...
$ trenni_kestvus <S4: Period> 59S, 1H 10M 22S, 1H 12M 16S, 59M 46S, 1H 13M 22S, 1H 7M 0S, 1H 29M 22S, 1H 9M 54S, 1H 14M 0...
$ distants_km <dbl> NA, 27.20, 27.36, 21.29, 30.07, 26.20, NA, 26.58, 27.16, 23.22, 6.55, 1.06, 2.84, NA, NA, NA, 4.01...
$ raw_profile_info <chr> "\n Country:\n Estonia\n Birthday:\n ...
$ raw_workout_date <chr> "May 10 at 11:35", "June 01 at 13:41", "May 25 at 13:42", "May 24 at 15:10", "May 22 at 15:58", "M...
$ raw_workout_info <chr> "was out riding for 0m:59s.", "was out cycling. He tracked 27.20 km in 1h:10m:22s.", "was out cycl...
Ülevaade kõigist veergudest
# Return every distinct value of one column together with its frequency.
#
# x      - name of the column to summarise, as a single string.
# andmed - data frame to summarise; defaults to the global
#          `trennid_toodeldud`, so existing one-argument calls keep working.
#
# Returns a one-column tibble whose column is named `x`. Each value is a
# factor label of the form "value (count)", rows are sorted by descending
# count, and missing values are shown as the string "NA".
unikaalsed <- function(x, andmed = trennid_toodeldud) {
  # fail early with a clear message instead of an obscure tidy-eval error
  stopifnot(is.character(x), length(x) == 1, x %in% colnames(andmed))

  veerg <- sym(x)

  tulemus <- andmed %>%
    mutate(uus = str_replace_na(!!veerg)) %>%
    count(uus) %>%
    mutate(
      uus = str_c(uus, " (", n, ")"),
      uus = fct_reorder(uus, n)
    ) %>%
    arrange(desc(n)) %>%
    select(uus)

  # rename the single result column back to the inspected column's name
  colnames(tulemus) <- x
  tulemus
}
# Columns worth inspecting: everything except the ID columns and the
# columns whose name contains "raw".
veerud <- trennid_toodeldud %>%
  select(-profile, -workout, -contains("raw")) %>%
  names()

# Frequency table of distinct values for every selected column.
p <- lapply(veerud, unikaalsed)

# Turn each one-column table into a list, merge them into a single named
# list and open it in the interactive viewer.
p %>%
  map(as.list) %>%
  flatten() %>%
  jsonedit()
TOP 15 populaarsemat spordiala
# Names of the 15 sports with the most logged workouts. The catch-all
# label "exercising" says nothing about the sport, so it is left out.
top_15_spordiala <- trennid_toodeldud %>%
  filter(spordiala != "exercising") %>%
  count(spordiala, sort = TRUE) %>%
  head(15) %>%
  pull(spordiala)

top_15_spordiala
[1] "walking" "running" "cycling" "mountain biking"
[5] "skating" "doing weight training" "skiing" "hiking"
[9] "doing aerobics" "doing circuit training" "swimming" "orienteering"
[13] "roller skiing" "golfing" "dancing"
TOP 10 lemmik spordiala
# Names of the 10 most common favourite sports, counted once per profile.
# Three profile labels are first mapped onto shorter names, then everything
# is lower-cased.
top_10_lemmik_spordiala <- trennid_toodeldud %>%
  distinct(profile, lemmik_spordiala) %>%
  mutate(
    lemmik_spordiala = str_to_lower(case_when(
      lemmik_spordiala == "Walking (Fitness)" ~ "walking",
      lemmik_spordiala == "Skiing (Cross country)" ~ "skiing",
      lemmik_spordiala == "Football (Soccer)" ~ "soccer",
      TRUE ~ lemmik_spordiala
    ))
  ) %>%
  count(lemmik_spordiala, sort = TRUE) %>%
  head(10) %>%
  pull(lemmik_spordiala)

top_10_lemmik_spordiala
[1] "walking" "running" "cycling (sport)" "cycling (transport)" "mountain biking"
[6] "roller skating" "weight training" "skiing" "orienteering" "soccer"
Populaarsemad spordialad eesti keeles
# Lookup table from English sport names (both workout labels and
# lower-cased favourite-sport labels) to their Estonian display names.
# Columns: spordiala (English key), spordiala_est (Estonian label).
spordiala_est <- local({
  tolked <- c(
    "walking" = "käimine",
    "running" = "jooksmine",
    "cycling" = "jalgrattasõit",
    "cycling (sport)" = "jalgrattasõit (sport)",
    "cycling (transport)" = "jalgrattasõit (transport)",
    "mountain biking" = "maastikuratta sõit",
    "skating" = "rulluisutamine",
    "roller skating" = "rulluisutamine",
    "doing weight training" = "jõutrenn",
    "weight training" = "jõutrenn",
    "skiing" = "suusatamine",
    "hiking" = "matkamine",
    "dancing" = "tantsimine",
    "doing aerobics" = "aeroobika",
    "doing circuit training" = "ringtreening",
    "swimming" = "ujumine",
    "orienteering" = "orienteerumine",
    "roller skiing" = "rullsuusatamine",
    "golfing" = "golf",
    "soccer" = "jalgpall"
  )

  tibble(
    spordiala = names(tolked),
    spordiala_est = unname(tolked)
  )
})
Ülevaade analüüsis kasutatud trennide ja inimeste arvust
# Horizontal bar chart of the 30 sports with the most logged workouts.
# Each bar is labelled "<workouts> (<distinct profiles>)".
trennid_toodeldud %>%
  filter(!is.na(spordiala)) %>%
  group_by(spordiala) %>%
  summarise(
    trennide_arv = n(),
    kasutajate_arv = n_distinct(profile)
  ) %>%
  ungroup() %>%
  arrange(desc(trennide_arv)) %>%
  head(30) %>%
  mutate(label = str_c(trennide_arv, " (", kasutajate_arv, ")")) %>%
  ggplot(aes(x = fct_reorder(spordiala, trennide_arv), y = trennide_arv)) +
  geom_col() +
  # hjust < 0 pushes the label just past the end of its bar
  geom_text(aes(label = label), hjust = -0.1) +
  coord_flip() +
  scale_y_continuous(
    breaks = seq(0, 35000, by = 10000),
    limits = c(0, 35000),
    expand = c(0, 0)
  ) +
  labs(
    title = "Trennide ja treenijate arv kokku",
    subtitle = "TOP 30 spordiala\nSulgudes on spordialaga tegelenud inimeste arv",
    x = "spordiala",
    y = "trennide arv"
  ) +
  theme_ipsum_rc()
Mis kell tehakse trenni nädalavahetusel?
# Weekend workouts of the 15 most popular sports, with start times binned
# to 15 minutes and moved onto one reference day (2017-01-01) so that all
# workouts share a single time-of-day axis.
#
# Result columns:
#   spordiala_est - Estonian sport name, factor ordered by n
#   aeg           - binned start time on the reference day
#   n             - weekend workouts per sport (counted before the 06:00 cut)
#   n_total       - rows left after the 06:00 cut (used in the plot subtitle)
top_trennid_nadalavahetusel <- trennid_toodeldud %>%
  filter(spordiala %in% top_15_spordiala) %>% # top 15 sports only
  left_join(spordiala_est, by = "spordiala") %>%
  filter(!is.na(workout_date)) %>% # drop workouts without a timestamp
  # Weekday number with Monday = 1. Unlike weekdays(), whose day names follow
  # the session locale, this keeps the weekend test working on non-English
  # systems.
  mutate(nadalapaev = wday(workout_date, week_start = 1)) %>%
  filter(nadalapaev %in% c(6, 7)) %>% # Saturday and Sunday only
  # bin every start time to a 15-minute interval
  thicken(by = "workout_date", interval = "15 mins") %>%
  # replace the date part with 2017-01-01 so only the time of day varies
  mutate(
    aeg = ymd_hms(
      str_replace(workout_date_15_min, "201.-..-..", "2017-01-01")
    )
  ) %>%
  select(spordiala_est, aeg) %>%
  group_by(spordiala_est) %>%
  mutate(n = n()) %>%
  filter(aeg > ymd_hms("2017-01-01 06:00:00")) %>% # only workouts after 06:00
  ungroup() %>%
  mutate(
    n_total = n(),
    spordiala_est = fct_reorder(spordiala_est, n)
  ) %>%
  arrange(spordiala_est, aeg)
Joining, by = "spordiala"
Datetime variable was unsorted, result will be unsorted as well.
# Joy plot (geom_joy) of weekend workout start times, one ridge per sport.
ggplot(top_trennid_nadalavahetusel, aes(x = aeg, y = spordiala_est)) +
  geom_joy(
    aes(fill = spordiala_est),
    scale = 2,
    colour = "white",
    size = 0.7
  ) +
  # two alternating shades between neighbouring ridges
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  # x axis shows the time of day only, with a tick every three hours
  scale_x_datetime(
    breaks = seq(
      from = ymd_hms("2017-01-01 06:00:00"),
      to = ymd_hms("2017-01-01 24:00:00"),
      by = "3 hours"
    ),
    labels = function(kell) format(kell, "%H:%M"),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = c(0.01, 0)) +
  labs(
    title = "Mis kell tehakse trenni nädalavahetusel?",
    subtitle = str_c(
      "ca ",
      round(max(top_trennid_nadalavahetusel$n_total, na.rm = TRUE) / 1000),
      " 000 Endomondos logitud trenni põhjal"
    ),
    x = "kellaaeg",
    y = "spordiala"
  ) +
  theme_ipsum_rc()
Mis kell tehakse trenni tööpäevadel?
# Working-day (Monday-Friday) workouts of the 15 most popular sports, with
# start times binned to 15 minutes and moved onto one reference day
# (2017-01-01) so that all workouts share a single time-of-day axis.
#
# Result columns:
#   spordiala_est - Estonian sport name, factor ordered by n
#   aeg           - binned start time on the reference day
#   n             - working-day workouts per sport (counted before the 06:00 cut)
#   n_total       - rows left after the 06:00 cut (used in the plot subtitle)
top_trennid_toopaevadel <- trennid_toodeldud %>%
  filter(spordiala %in% top_15_spordiala) %>% # top 15 sports only
  left_join(spordiala_est, by = "spordiala") %>%
  filter(!is.na(workout_date)) %>% # drop workouts without a timestamp
  # Weekday number with Monday = 1. With weekdays() and English day names a
  # non-English locale would silently let weekend workouts through here.
  mutate(nadalapaev = wday(workout_date, week_start = 1)) %>%
  filter(!nadalapaev %in% c(6, 7)) %>% # exclude Saturday and Sunday
  # bin every start time to a 15-minute interval
  thicken(by = "workout_date", interval = "15 mins") %>%
  # replace the date part with 2017-01-01 so only the time of day varies
  mutate(
    aeg = ymd_hms(
      str_replace(workout_date_15_min, "201.-..-..", "2017-01-01")
    )
  ) %>%
  select(spordiala_est, aeg) %>%
  group_by(spordiala_est) %>%
  mutate(n = n()) %>%
  filter(aeg > ymd_hms("2017-01-01 06:00:00")) %>% # only workouts after 06:00
  ungroup() %>%
  mutate(
    n_total = n(),
    spordiala_est = fct_reorder(spordiala_est, n)
  ) %>%
  arrange(spordiala_est, aeg)
Joining, by = "spordiala"
Datetime variable was unsorted, result will be unsorted as well.
# Joy plot (geom_joy) of working-day workout start times, one ridge per sport.
ggplot(top_trennid_toopaevadel, aes(x = aeg, y = spordiala_est)) +
  geom_joy(
    aes(fill = spordiala_est),
    scale = 2,
    colour = "white",
    size = 0.7
  ) +
  # two alternating shades between neighbouring ridges
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  # x axis shows the time of day only, with a tick every three hours
  scale_x_datetime(
    breaks = seq(
      from = ymd_hms("2017-01-01 06:00:00"),
      to = ymd_hms("2017-01-01 24:00:00"),
      by = "3 hours"
    ),
    labels = function(kell) format(kell, "%H:%M"),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = c(0.01, 0)) +
  labs(
    title = "Mis kell tehakse trenni tööpäevadel?",
    subtitle = str_c(
      "ca ",
      round(max(top_trennid_toopaevadel$n_total, na.rm = TRUE) / 1000),
      " 000 Endomondos logitud trenni põhjal"
    ),
    x = "kellaaeg",
    y = "spordiala"
  ) +
  theme_ipsum_rc()
Mis nädalapäevadel mingit spordiala harrastatakse?
# Day of week of every timestamped workout in the 15 most popular sports.
#
# Result columns:
#   spordiala_est - Estonian sport name, factor ordered by n
#   nadalapaev    - weekday number, 1 = Monday
#   n             - workouts per sport
#   n_total       - total number of rows (used in the plot subtitle)
top_trennid_paevade_loikes <- trennid_toodeldud %>%
  filter(
    spordiala %in% top_15_spordiala,
    !is.na(workout_date)
  ) %>%
  left_join(spordiala_est) %>%
  transmute(
    spordiala_est,
    nadalapaev = wday(workout_date, week_start = 1)
  ) %>%
  group_by(spordiala_est) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  mutate(
    n_total = n(),
    spordiala_est = fct_reorder(spordiala_est, n)
  ) %>%
  arrange(spordiala_est, nadalapaev)
Joining, by = "spordiala"
# Joy plot (geom_joy) of workouts over the days of the week, one ridge per
# sport. A vertical line between Friday (5) and Saturday (6) marks where the
# weekend starts.
ggplot(top_trennid_paevade_loikes, aes(x = nadalapaev, y = spordiala_est)) +
  geom_joy(
    aes(fill = spordiala_est),
    scale = 2,
    colour = "white",
    size = 0.7
  ) +
  geom_vline(xintercept = 5.5) +
  annotate("text", x = 6.5, y = 16.5, label = "nädalavahetuse") +
  # two alternating shades between neighbouring ridges
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  scale_x_continuous(
    breaks = 1:7,
    labels = c("esmasp", "teisip", "kolmap", "neljap", "reede", "laup", "pühap"),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = c(0.01, 0)) +
  labs(
    title = "Mis päeval spordiala harrastatakse?",
    subtitle = str_c(
      "ca ",
      round(max(top_trennid_paevade_loikes$n_total, na.rm = TRUE) / 1000),
      " 000 Endomondos logitud trenni põhjal"
    ),
    x = "päev",
    y = "spordiala"
  ) +
  theme_ipsum_rc()
Kuidas jagunevad spordialad trenni kestvuse lõikes?
# Duration in minutes of every timestamped workout of at most two hours in
# the 15 most popular sports.
#
# Result columns:
#   spordiala_est     - Estonian sport name, factor ordered by n
#   kestvus_minutites - workout duration in minutes
#   n                 - workouts per sport
#   n_total           - total number of rows (used in the plot subtitle)
#
# NOTE(review): the recorded run printed "longer object length is not a
# multiple of shorter object length" five times at this step. Presumably the
# slots of the Period column `trenni_kestvus` have inconsistent lengths in
# the loaded data - verify upstream, since recycling would give wrong
# durations.
top_trennid_kestvus <- trennid_toodeldud %>%
  # the duration is computed on the untouched data, before any row filtering
  mutate(kestvus_minutites = as.numeric(as.duration(trenni_kestvus)) / 60) %>%
  filter(
    spordiala %in% top_15_spordiala,
    !is.na(workout_date),
    kestvus_minutites <= 120 # workouts of at most 2 hours
  ) %>%
  left_join(spordiala_est) %>%
  select(spordiala_est, kestvus_minutites) %>%
  group_by(spordiala_est) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  mutate(
    n_total = n(),
    spordiala_est = fct_reorder(spordiala_est, n)
  ) %>%
  arrange(spordiala_est, kestvus_minutites)
longer object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthJoining, by = "spordiala"
# Joy plot (geom_joy) of workout duration in minutes, one ridge per sport.
ggplot(top_trennid_kestvus, aes(x = kestvus_minutites, y = spordiala_est)) +
  geom_joy(
    aes(fill = spordiala_est),
    scale = 2,
    colour = "white",
    size = 0.7
  ) +
  # two alternating shades between neighbouring ridges
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  scale_x_continuous(
    breaks = seq(0, 120, by = 30),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = c(0.01, 0)) +
  labs(
    title = "Kui pikalt trenni tehakse?",
    subtitle = str_c(
      "ca ",
      round(max(top_trennid_kestvus$n_total, na.rm = TRUE) / 1000),
      " 000 Endomondos logitud trenni põhjal"
    ),
    x = "kestvus, min",
    y = "spordiala"
  ) +
  theme_ipsum_rc()
Kuidas jagunevad spordialade lõikes inimeste vanus?
# Current age (10-70 years) of every profile per sport, for the 15 most
# popular sports. Each profile appears once per sport.
#
# Result columns:
#   spordiala_est - Estonian sport name, factor ordered by n
#   vanus_praegu  - age today in whole years (days since birth / 365, rounded)
#   n             - workouts per sport, counted before de-duplicating profiles
#   n_total       - number of distinct profiles (used in the plot subtitle)
top_trennid_vanus <- trennid_toodeldud %>%
  filter(
    spordiala %in% top_15_spordiala,
    !is.na(synnipaev)
  ) %>%
  left_join(spordiala_est) %>%
  mutate(
    vanus_praegu = round(as.numeric(today() - synnipaev, units = "days") / 365)
  ) %>%
  filter(between(vanus_praegu, 10, 70)) %>%
  select(spordiala_est, vanus_praegu, profile) %>%
  group_by(spordiala_est) %>%
  mutate(n = n()) %>%
  # keep one row per profile and sport
  distinct(profile, spordiala_est, .keep_all = TRUE) %>%
  ungroup() %>%
  mutate(
    n_total = n_distinct(profile),
    spordiala_est = fct_reorder(spordiala_est, n)
  ) %>%
  select(-profile) %>%
  arrange(spordiala_est, vanus_praegu)
Joining, by = "spordiala"
# Joy plot (geom_joy) of the age of the people doing each sport.
ggplot(top_trennid_vanus, aes(x = vanus_praegu, y = spordiala_est)) +
  geom_joy(
    aes(fill = spordiala_est),
    scale = 2,
    colour = "white",
    size = 0.7
  ) +
  # two alternating shades between neighbouring ridges
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  scale_x_continuous(
    breaks = seq(10, 70, by = 10),
    limits = c(10, 70),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = c(0.01, 0)) +
  labs(
    title = "Mis vanuses trenni tehakse?",
    subtitle = str_c(
      "ca ",
      round(max(top_trennid_vanus$n_total, na.rm = TRUE) / 1000),
      " 000 Endomondo konto andmetel"
    ),
    x = "vanus",
    y = "spordiala"
  ) +
  theme_ipsum_rc()
Vanus ja lemmik spordiala.
# Current age (10-70 years) of every profile by favourite sport, for the 10
# most common favourite sports. Each profile appears once per favourite sport.
#
# Result columns:
#   lemmik_spordiala_est - Estonian name of the favourite sport, factor
#                          ordered by n
#   vanus_praegu         - age today in whole years (days since birth / 365,
#                          rounded)
#   n                    - workout rows per favourite sport, counted before
#                          de-duplicating profiles
#   n_total              - number of distinct profiles (used in the subtitle)
top_trennid_lemmik_spordiala_vanus <- trennid_toodeldud %>%
  mutate(
    # map three profile labels onto shorter names, then lower-case everything
    lemmik_spordiala = str_to_lower(case_when(
      lemmik_spordiala == "Walking (Fitness)" ~ "walking",
      lemmik_spordiala == "Skiing (Cross country)" ~ "skiing",
      lemmik_spordiala == "Football (Soccer)" ~ "soccer",
      TRUE ~ lemmik_spordiala
    )),
    vanus_praegu = round(as.numeric(today() - synnipaev, units = "days") / 365)
  ) %>%
  filter(lemmik_spordiala %in% top_10_lemmik_spordiala) %>%
  left_join(spordiala_est, by = c("lemmik_spordiala" = "spordiala")) %>%
  filter(
    !is.na(synnipaev),
    between(vanus_praegu, 10, 70),
    !is.na(lemmik_spordiala)
  ) %>%
  select(lemmik_spordiala_est = spordiala_est, vanus_praegu, profile) %>%
  group_by(lemmik_spordiala_est) %>%
  mutate(n = n()) %>%
  # every user once
  distinct(profile, lemmik_spordiala_est, .keep_all = TRUE) %>%
  ungroup() %>%
  mutate(
    n_total = n_distinct(profile),
    lemmik_spordiala_est = fct_reorder(lemmik_spordiala_est, n)
  ) %>%
  select(-profile) %>%
  arrange(lemmik_spordiala_est, vanus_praegu)
# Joy plot (geom_joy) of age by favourite sport.
ggplot(
  top_trennid_lemmik_spordiala_vanus,
  aes(x = vanus_praegu, y = lemmik_spordiala_est)
) +
  geom_joy(
    aes(fill = lemmik_spordiala_est),
    scale = 2,
    colour = "white",
    size = 0.7
  ) +
  # two alternating shades between neighbouring ridges
  scale_fill_cyclical(values = c("#3182bd", "#6baed6")) +
  scale_x_continuous(
    breaks = seq(10, 70, by = 10),
    limits = c(10, 70),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = c(0.01, 0)) +
  labs(
    title = "Lemmik spordiala vs vanus",
    subtitle = str_c(
      "ca ",
      round(
        max(top_trennid_lemmik_spordiala_vanus$n_total, na.rm = TRUE) / 1000
      ),
      " 000 Endomondo konto andmetel"
    ),
    x = "vanus",
    y = "lemmik spordiala"
  ) +
  theme_ipsum_rc()
Kuidas mõjutab vanus mediaan keskmist jooksu kiirust?
# Scatter plot of each runner's median pace (min/km) against current age,
# coloured by sex. Only runners with at least 10 runs that have a usable
# pace and a median pace between 3 and 10 min/km are shown.
trennid_toodeldud %>%
  ungroup() %>%
  mutate(kestvus_minutites = as.numeric(as.duration(trenni_kestvus)) / 60) %>%
  filter(
    !is.na(synnipaev),
    spordiala == "running",
    !is.na(trenni_kestvus),
    !is.na(sugu),
    # A pace needs a positive distance. Without this, runs with a missing
    # distance still counted towards the 10-run threshold below, and a zero
    # distance produced an infinite pace.
    !is.na(distants_km),
    distants_km > 0
  ) %>%
  mutate(
    vanus_praegu = as.numeric(difftime(today(), synnipaev, units = "days")) / 365,
    kiirus = kestvus_minutites / distants_km # pace in minutes per km
  ) %>%
  group_by(profile, vanus_praegu, sugu) %>%
  summarise(
    kiirus_median = median(kiirus, na.rm = TRUE),
    n = n() # runs with a usable pace
  ) %>%
  filter(n >= 10, kiirus_median <= 10, kiirus_median >= 3) %>%
  ungroup() %>%
  ggplot(aes(vanus_praegu, kiirus_median, colour = sugu)) +
  geom_point() +
  theme_ipsum_rc() +
  labs(
    x = "vanus",
    y = "tempo min/km",
    title = "Keskmise jooksu tempo ja vanuse seos",
    subtitle = "Iga punkt tähistab ühe inimese mediaan tempot"
  )
longer object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object length
Milline on vanuse ja mediaan keskmine jooksu distantsi vaheline seos?
# Scatter plot of each runner's median run distance (km) against current
# age, coloured by sex. Only runners aged at least 10 with at least 10 runs
# that have a distance and a median distance of at most 15 km are shown.
trennid_toodeldud %>%
  ungroup() %>%
  filter(spordiala == "running") %>%
  filter(
    !is.na(synnipaev),
    !is.na(sugu),
    !is.na(distants_km)
  ) %>%
  mutate(
    vanus_praegu = as.numeric(difftime(today(), synnipaev, units = "days")) / 365
  ) %>%
  group_by(profile, vanus_praegu, sugu) %>%
  summarise(
    distance_median = median(distants_km, na.rm = TRUE),
    n = n()
  ) %>%
  ungroup() %>%
  filter(
    n >= 10,
    vanus_praegu >= 10,
    distance_median <= 15
  ) %>%
  ggplot(aes(x = vanus_praegu, y = distance_median, colour = sugu)) +
  geom_point() +
  labs(
    title = "Keskmise jooksu distantsi ja vanuse seos",
    subtitle = "Iga punkt tähistab ühe inimese mediaan distantsi",
    x = "vanus",
    y = "distants, km"
  ) +
  theme_ipsum_rc()